#!/bin/bash
# Launcher: start a local Xinference supervisor and deploy a Qwen2.5-VL
# model through the vLLM engine. Intended to run as a long-lived
# entrypoint (e.g. inside a container).
set -x

# Very large timeouts so long generations / large multimodal prompts are
# not killed by vLLM's engine-iteration or RPC watchdogs.
# NOTE(review): values are in seconds (10h) and milliseconds (10000h)
# respectively — presumably "effectively disabled"; confirm intent.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=36000
export VLLM_RPC_TIMEOUT=36000000


# Start the Xinference supervisor in the background: models sourced from
# ModelScope, cache rooted at /xinference/xinference_cache, listening on
# all interfaces, port 8000. Its PID stays in $! for the final wait.
XINFERENCE_MODEL_SRC=modelscope XINFERENCE_HOME=/xinference/xinference_cache xinference-local -H 0.0.0.0 -p 8000 &

# Poll until the HTTP endpoint answers before launching a model.
# Bounded retries so a server that dies on startup does not leave this
# script spinning forever (original looped unconditionally).
readiness_attempts=300   # ~5 minutes at 1s per attempt
until curl -s --max-time 2 "http://localhost:8000" > /dev/null; do
  readiness_attempts=$((readiness_attempts - 1))
  if [ "$readiness_attempts" -le 0 ]; then
    echo "xinference-local did not become ready on :8000; aborting" >&2
    exit 1
  fi
  sleep 1
done


# Deploy Qwen2.5-VL-32B-Instruct (AWQ checkpoint) on 4 GPUs, 1 replica,
# 32K context, up to 8 images per prompt, pixel budget capped for the
# vision processor.
# FIX: the original passed "--model-format pytorch --model-format awq";
# the weights at the given path are AWQ-quantized, so only "awq" is kept.
# FIX: "$[...]" arithmetic is deprecated bash syntax — use "$((...))".
xinference launch --model-engine vllm -e "http://0.0.0.0:8000" \
  --model-name qwen2.5-vl-instruct \
  --size-in-billions 32 \
  --model-format awq \
  --model_quantization awq \
  --model_path /home/weights/Qwen/Qwen2.5-VL-32B-Instruct-AWQ \
  --n-gpu 4 \
  --replica 1 \
  --max_model_len $((1024 * 32)) \
  --trust-remote-code 1 \
  --limit_mm_per_prompt '{"image":8,"video":0}' \
  --mm_processor_kwargs '{"min_pixels":200704, "max_pixels":1003520}'

# Alternative layout kept for reference: 2 GPUs x 2 replicas, 4 images/prompt.
# xinference launch --model-engine vllm -e "http://0.0.0.0:8000" --model-name qwen2.5-vl-instruct --size-in-billions 32 --model-format awq --model_quantization awq --model_path /home/weights/Qwen/Qwen2.5-VL-32B-Instruct-AWQ --n-gpu 2 --replica 2 --max_model_len $((1024*32)) --trust-remote-code 1 --limit_mm_per_prompt '{"image":4,"video":0}' --mm_processor_kwargs '{"min_pixels":200704, "max_pixels":1003520}'

# $! still holds the PID of the background xinference-local server started
# above — the foreground `xinference launch` does not change $!. Capturing
# it here (rather than right after the `&`) works but is fragile; do not
# insert other background jobs between the server start and this line.
PID1=$!
# Block for the server's lifetime (quoted per SC2086), then reap any
# remaining background jobs as a final barrier.
wait "$PID1"
wait


